import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sqlite3
from textblob import TextBlob
from wordcloud import WordCloud,STOPWORDS
import re
import string
import nltk
from nltk.corpus import stopwords
# Open the local SQLite database of Amazon reviews and load the whole
# Reviews table into a DataFrame.
con = sqlite3.connect(r'C:\Users\rishi\Downloads\Projects\Amazon Customer Analysis\drive-download-20210301T164144Z-001\database.sqlite')
df = pd.read_sql_query('SELECT * FROM Reviews', con)
# Notebook-style inspection: preview and dimensions of the loaded table.
df.head()
df.shape
pd.read_sql_query('SELECT * FROM Reviews LIMIT 3', con)
# Smoke-test TextBlob sentiment on the first summary
# (polarity is a float in [-1, 1]).
text = df['Summary'][0]
TextBlob(text).sentiment.polarity
# Score every review summary with a TextBlob sentiment polarity in
# [-1, 1]. TextBlob raises on non-string values (e.g. NaN summaries),
# so those rows deliberately fall back to a neutral score of 0.
polarity = []
for summary in df['Summary']:
    try:
        polarity.append(TextBlob(summary).sentiment.polarity)
    except Exception:
        # was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed; the best-effort fallback is kept.
        polarity.append(0)
# Attach the scores to a copy so the raw df stays untouched.
data = df.copy()
data['polarity'] = polarity
len(polarity)  # sanity check: one score per row
def _polarity_wordcloud(summaries, title):
    """Render a word cloud for the given Summary series.

    Keeps letters only, collapses repeated spaces, and draws a 1000x500
    word cloud filtered through WordCloud's standard stopword list.
    Shared by positive_analysis/negative_analysis, which previously
    duplicated this logic line for line.
    """
    total_text = ' '.join(summaries)
    total_text = re.sub('[^a-zA-Z]', ' ', total_text)  # letters only
    total_text = re.sub(' +', ' ', total_text)         # collapse spaces
    cloud = WordCloud(width=1000, height=500,
                      stopwords=set(STOPWORDS)).generate(total_text)
    plt.figure(figsize=(15, 5))
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title)


def positive_analysis(data):
    """Word cloud of summaries scored as positive (polarity > 0)."""
    _polarity_wordcloud(data[data['polarity'] > 0]['Summary'],
                        'Explorative Data Analysis on Positive Comments')


def negative_analysis(data):
    """Word cloud of summaries scored as negative (polarity < 0)."""
    _polarity_wordcloud(data[data['polarity'] < 0]['Summary'],
                        'Explorative Data Analysis on Negative Comments')
# Render the positive and negative word clouds for the scored data.
positive_analysis(data)
negative_analysis(data)
# Number of distinct reviewers in the dataset.
df['UserId'].nunique()
# Per-user activity: review/summary counts, mean star score and number
# of product purchases, most prolific reviewers first.
raw = df.groupby('UserId').agg({'Summary':'count','Text':'count', 'Score':'mean', 'ProductId':'count'}).sort_values(by = 'Text' ,
ascending = False)
raw.columns = ['Number_of_summaries','No_of_text', 'Avg_Score' , 'No_of_products_purchased']
raw
# Promote the UserId index to a plain column so plotly can use it as an axis.
raw['User'] = raw.index
plt.figure(figsize=(20,8))
# Top-10 reviewers by purchase count.
fig = px.bar(data_frame= raw[0:10] , x = 'User' , y = 'No_of_products_purchased' , labels={'No_of_products_purchased':'No of products purchased'} )
fig.show()
# Fresh copy of the raw reviews for the text-length analysis below.
final = df.copy()
# Notebook-style inspection: missing values per column, then count of
# fully duplicated rows.
final.isnull().sum()
final.duplicated().sum()
def calc_length(text):
    """Return the number of words in *text*.

    Uses str.split() with no argument, which splits on any run of
    whitespace: repeated spaces, tabs or newlines no longer inflate the
    count, and an empty string counts as 0 words. (The previous
    split(' ') counted every empty fragment between consecutive spaces
    as a word and returned 1 for the empty string.)
    """
    return len(text.split())
# Word count per review body (see calc_length above).
final['Text_Length'] = final['Text'].apply(calc_length)
plt.figure(figsize=(20,8))
# NOTE(review): the plotly figure is neither assigned nor .show()n — it
# only renders as a notebook cell output; confirm this runs in a notebook.
px.box(final , y = 'Text_Length' ,title = 'Length of Comments' )
# Distribution of star ratings.
# NOTE(review): positional Series argument to sns.countplot is
# deprecated in newer seaborn (prefer x=final['Score']) — verify the
# installed seaborn version still accepts it.
sns.countplot(final['Score'] , palette="plasma")
plt.title('Scores/Ratings given by the user')
# Work on an explicit copy of the first 2000 reviews. The original
# sliced without .copy(), so the in-place assignments below mutated a
# view of df and raised pandas' SettingWithCopyWarning.
new_final = df[0:2000].copy()
new_final['Text'] = new_final['Text'].str.lower()
punctuations = string.punctuation
# Translation table mapping each ASCII punctuation character to None
# (i.e. delete it); built once at import time.
_PUNC_TABLE = str.maketrans('', '', punctuations)


def remove_punc(review):
    """Return *review* with every ASCII punctuation character removed.

    Uses a single C-level str.translate pass instead of the original
    character-by-character string concatenation, which was quadratic in
    the length of the review.
    """
    return review.translate(_PUNC_TABLE)
# Strip punctuation from every review text (uses remove_punc above).
new_final['Text'] = new_final['Text'].apply(remove_punc)
def remove_stopwords(review):
    """Return *review* with English NLTK stopwords removed.

    The stopword set is built once per call. The original evaluated
    set(stopwords.words('english')) inside the comprehension condition,
    rebuilding the whole set for every single word of the review.
    Tokenization stays on single spaces to match the prior behavior.
    """
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in review.split(' ') if word not in stop_words)
new_final['Text'] = new_final['Text'].apply(remove_stopwords)
# How many reviews still contain URL-ish fragments?
# NOTE(review): str.contains treats the pattern as a regex by default,
# so 'http?' actually matches 'htt' followed by an OPTIONAL 'p' — it was
# probably meant as a literal; the plain 'http' check below is the
# meaningful one.
new_final['Text'].str.contains('http?').sum()
new_final['Text'].str.contains('http').sum()
# Compiled once at module level so repeated .apply calls reuse the
# pattern object directly instead of going through re's cache lookup.
# NOTE(review): the pattern is loose — 'http.\w+' requires one extra
# character plus word characters after 'http', so a bare trailing
# 'http' survives and 'href' is removed wherever it appears; confirm
# this matches the tokens actually present in the data.
_URL_PATTERN = re.compile(r'href|http.\w+')


def remove_urls(review):
    """Remove 'href' attributes and http-like tokens from *review*."""
    return _URL_PATTERN.sub(r'', review)
new_final['Text'] = new_final['Text'].apply(remove_urls)
# Drop leftover 'br' fragments from HTML <br> tags. Vectorized
# str.replace replaces the original chained-indexing loop
# (new_final['Text'][i] = ...), which triggered SettingWithCopyWarning
# and cost a full pandas label lookup per row.
# NOTE(review): this also mangles words that merely contain 'br'
# (e.g. 'brand' -> 'and'), exactly as the original loop did — a
# word-boundary regex would change output, so behavior is kept.
new_final['Text'] = new_final['Text'].str.replace('br', '', regex=False)
# NOTE(review): this rebinds the module-level name 'stopwords',
# shadowing nltk.corpus.stopwords imported at the top. It is harmless
# here because remove_stopwords has already run, but fragile if cells
# are re-executed out of order.
stopwords = set(STOPWORDS)
data2 = new_final.copy()

# Build one big lowercase string out of every cleaned review. A single
# ' '.join is O(total length); the original accumulated with += per row,
# which is quadratic, and re-lowercased text that was already lowered.
comment_words = ' '.join(
    token.lower()
    for val in data2['Text']
    for token in val.split()
) + ' '

# Final word cloud over the fully cleaned corpus.
wordcloud3 = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)
plt.figure(figsize = (8, 8))
plt.imshow(wordcloud3)
plt.axis("off")